Chapter 3 Data quality

load("data/data.Rdata")

3.1 Load statistics

read_stats <- read_tsv("data/reports/multiqc_fastqc.txt", 
                        col_types = cols_only("Sample" = col_character(),
                                              "Total Sequences" = col_double(),
                                              "%GC" = col_double(),
                                              "total_deduplicated_percentage" = col_double())) %>%
  mutate(Sample = str_extract(Sample, "M\\d+")) %>%
  rename(microsample=Sample,total_sequences="Total Sequences",percent_gc="%GC",percent_unique=total_deduplicated_percentage) %>%
  group_by(microsample) %>%
  summarise(total_sequences=sum(total_sequences), percent_unique=mean(percent_unique), percent_gc=mean(percent_gc))
host_mapping_stats <- read_tsv("data/reports/multiqc_samtools_flagstat.txt") %>%
    mutate(reference = case_when(
        grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
        grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
        TRUE ~ NA )) %>%
    filter(reference=="chicken") %>%
    mutate(Sample = str_extract(Sample, "M\\d+")) %>%
    rename(microsample=Sample,reads_mapped_host=mapped_passed,reads_mapped_host_percent=mapped_passed_pct) %>%
    select(microsample,reads_mapped_host,reads_mapped_host_percent) %>%
    group_by(microsample) %>%
    summarise(reads_mapped_host=sum(reads_mapped_host),reads_mapped_host_percent=mean(reads_mapped_host_percent))
human_mapping_stats <- read_tsv("data/reports/multiqc_samtools_flagstat.txt") %>%
    mutate(reference = case_when(
        grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
        grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
        TRUE ~ NA )) %>%
    filter(reference=="human") %>%
    mutate(Sample = str_extract(Sample, "M\\d+")) %>%
    rename(microsample=Sample, reads_mapped_human=mapped_passed,reads_mapped_human_percent=mapped_passed_pct) %>%
    select(microsample,reads_mapped_human,reads_mapped_human_percent) %>%
    group_by(microsample) %>%
    summarise(reads_mapped_human=sum(reads_mapped_human),reads_mapped_human_percent=mean(reads_mapped_human_percent))
quantification_stats  <- read_tsv("data/reports/multiqc_samtools_stats.txt") %>%
   filter(str_detect(Sample, "mgg-pbdrep")) %>%
   mutate(Sample = str_extract(Sample, "M\\d+")) %>%
   rename(microsample=Sample) %>%
    group_by(microsample) %>%
    summarise(reads_mapped=sum(reads_mapped),reads_mapped_percent=mean(reads_mapped_percent))
quality_stats <- read_stats %>%
    left_join(host_mapping_stats, by=join_by(microsample==microsample)) %>%
    left_join(human_mapping_stats, by=join_by(microsample==microsample)) %>%
    left_join(quantification_stats, by=join_by(microsample==microsample))

3.2 Individual overview

3.2.1 Sequencing depth

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=total_sequences,y=microsample,fill=section))+
      geom_col()+
      scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
      geom_vline(xintercept=10000000, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Number of reads", y="Microsamples", fill="Library protocol")

3.2.2 Sequence duplication

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=percent_unique,y=microsample,fill=collection))+
      geom_col()+
      scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
      geom_vline(xintercept=35, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Percentage of non-duplicates", y="Microsamples", fill="Collection success")

3.2.3 GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=percent_gc,y=microsample,fill=collection))+
      geom_col()+
      scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
      geom_vline(xintercept=60, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Percentage of GC", y="Microsamples", fill="Library protocol")

3.2.4 Host %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=reads_mapped_host_percent,y=microsample,fill=section))+
      geom_col()+
      scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Host %", y="Microsamples", fill="Library protocol")

3.2.5 Human %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=reads_mapped_human_percent,y=microsample,fill=section))+
      geom_col()+
      scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
      geom_vline(xintercept=5, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Human %", y="Microsamples", fill="Library protocol")

3.2.6 Bacteria mapping %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=reads_mapped_percent,y=microsample,fill=section))+
      geom_col()+
      scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
      geom_vline(xintercept=75, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Mapped to MAGs (%)", y="Microsamples", fill="Library protocol")

3.2.7 Domain-adjusted mapping rate

3.3 Biplots

3.3.1 Sequencing depth vs. GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    ggplot(aes(x=percent_gc,y=total_sequences,color=section))+
      geom_point()+
      scale_color_manual(values=c("#a3d1cf","#d1a3cf")) +
      facet_nested(. ~ batch, scales="free") +
      labs(color="Sexrion")

3.3.2 Unique sequences vs. GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    ggplot(aes(x=percent_gc,y=percent_unique,color=section))+
      geom_point()+
      scale_color_manual(values=c("#a3d1cf","#d1a3cf")) +
      facet_nested(. ~ batch, scales="free")+
      labs(color="Library protocol")

3.4 Quality flagging

quality <- quality_stats %>%
    mutate(depth = case_when(
        total_sequences <= 10000000 ~ 0,
        total_sequences > 10000000 ~ 1,
        TRUE ~ NA)) %>%
    mutate(duplicates = case_when(
        percent_unique <= 35 ~ 0,
        percent_unique > 35 ~ 1,
        TRUE ~ NA)) %>%
    mutate(gc = case_when(
        percent_gc >= 60 ~ 0,
        percent_gc < 60 ~ 1,
        TRUE ~ NA)) %>%
    mutate(human = case_when(
        reads_mapped_human_percent >= 5 ~ 0,
        reads_mapped_human_percent < 5 ~ 1,
        TRUE ~ NA)) %>%
    mutate(bacteria = case_when(
        reads_mapped_percent <= 75 ~ 0,
        reads_mapped_percent > 75 ~ 1,
        TRUE ~ NA)) %>%
    select(microsample, depth, duplicates, gc, human, bacteria) %>%
    mutate(quality = depth + duplicates + gc + human + bacteria) %>%
    select(microsample, quality)

quality %>% write_tsv("results/quality.tsv")

3.4.1 Quality overview

quality %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=quality,y=microsample,fill=collection))+
      geom_col()+
      scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
      geom_vline(xintercept=5, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Quality score", y="Microsamples", fill="Collection success")

quality %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    group_by(section) %>%
    summarise(average=mean(quality, na.rm=TRUE), percentage_5 = mean(quality == 5, na.rm = TRUE) * 100) %>%
    tt()
tinytable_zxscwv09g71a7x30bd10
section average percentage_5
Caecum right 4.430556 62.5